### Load packages
options(scipen = 999)
library(data.table)
library(magrittr)
source('~/Documents/R/Utils/functions/feature_comparison.R')
source('~/Documents/R/Utils/functions/psi.R')
source('~/Documents/R/Utils/functions/helper.R')
### Input data & parameters
# input: a data.table / list of data.tables with features of interest
input <- fread("~/Documents/R/churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Split the rows on the Churn column; the two groups are compared below.
inputGood <- input[Churn == "No"]
inputBad <- input[Churn == "Yes"]

# featureList: names of the features whose distributions are compared.
# A column is kept when it is numeric (whatever its cardinality) or when it
# has at most 35 distinct values; only non-numeric columns with more than 35
# distinct values are dropped.
# vapply() returns a plain logical vector with one entry per column, so the
# selection no longer depends on comparing a list against a number.
featureList <- names(input)[vapply(
  input,
  function(col) is.numeric(col) || uniqueN(col) <= 35,
  logical(1)
)]
# customerID and Churn (the column used for the split above) are excluded.
featureList <- setdiff(featureList, c("customerID", "Churn"))
- Last updated on 2018-09-08
- The features `customerID` and `Churn` are excluded from the feature distributions
# Call PSI() once per feature in featureList, passing the feature's values
# from inputGood (labelled "No") as oldScore and its values from inputBad
# (labelled "Yes") as newScore. The result is a list with one PSI() result
# per feature, in featureList order.
# Columns are extracted with [[ ]] rather than get() inside `j`, so the name
# can only ever resolve to a column of the table.
# NOTE(review): `quantile` is not assigned anywhere in the visible script;
# unless it is defined elsewhere, this forwards the stats::quantile function
# itself to PSI() -- confirm what PSI() expects for this argument.
psi <- purrr::map(
  featureList,
  function(x) {
    PSI(
      oldScore = inputGood[[x]],
      newScore = inputBad[[x]],
      dataNames = c("No", "Yes"),
      quantile = quantile,
      showVis = FALSE
    )
  }
)
# Build one report section per feature: an <h3> with the feature name, an
# <h5> with its PSI entry, the FeatureComparison() output for the NoChurn vs
# Churn values, and a closing <hr>.
# seq_along() is used instead of 1L:length(), which would yield c(1, 0) for
# an empty featureList and then fail on an NA column name.
plot <- lapply(
  seq_along(featureList),
  function(x) {
    feature <- featureList[x]
    list(
      htmltools::tags$h3(feature),
      htmltools::tags$h5(paste0("PSI: ", psi[x])),
      FeatureComparison(
        input = list(
          NoChurn = inputGood[[feature]],
          Churn = inputBad[[feature]]
        ),
        vectorName = feature
      ),
      htmltools::tags$hr()
    )
  }
)
# Emit the sections ordered by the first element of each PSI result, largest
# first. rev(order(...)) is kept so the ordering of ties is unchanged.
htmltools::tagList(plot[rev(order(purrr::map_dbl(psi, 1)))])
Contract
PSI: 1.19
OnlineSecurity
PSI: 0.77
TechSupport
PSI: 0.75
InternetService
PSI: 0.61
DeviceProtection
PSI: 0.54
OnlineBackup
PSI: 0.51
PaymentMethod
PSI: 0.46
StreamingMovies
PSI: 0.41
StreamingTV
PSI: 0.38
PaperlessBilling
PSI: 0.2
Dependents
PSI: 0.15
Partner
PSI: 0.1
SeniorCitizen
PSI: 0.09
MultipleLines
PSI: 0
PhoneService
PSI: 0
gender
PSI: 0
TotalCharges
PSI: -1
MonthlyCharges
PSI: -1
tenure
PSI: -1